library(readr)
library(stats)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
# Load the Titanic passenger data into a tibble.
# NOTE: the absolute path below is specific to the machine it was written on.
titanic <- read_csv("C:/Users/Admin/Desktop/titanic.csv")
## Rows: 891 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (7): PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Checking the structure of dataset:
# str() lists every column with its type and its first few values
str(titanic)
## spc_tbl_ [891 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ PassengerId: num [1:891] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : num [1:891] 0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : num [1:891] 3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr [1:891] "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr [1:891] "male" "female" "female" "female" ...
##  $ Age        : num [1:891] 22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : num [1:891] 1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : num [1:891] 0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr [1:891] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num [1:891] 7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr [1:891] NA "C85" NA "C123" ...
##  $ Embarked   : chr [1:891] "S" "C" "S" "S" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   PassengerId = col_double(),
##   ..   Survived = col_double(),
##   ..   Pclass = col_double(),
##   ..   Name = col_character(),
##   ..   Sex = col_character(),
##   ..   Age = col_double(),
##   ..   SibSp = col_double(),
##   ..   Parch = col_double(),
##   ..   Ticket = col_character(),
##   ..   Fare = col_double(),
##   ..   Cabin = col_character(),
##   ..   Embarked = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Summary statistics
# Numeric columns get min/quartiles/mean/max (plus an NA count when values
# are missing); character columns only report length, class and mode
summary(titanic)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 
# Preview the first rows of the data
head(titanic)
## # A tibble: 6 × 12
##   PassengerId Survived Pclass Name    Sex     Age SibSp Parch Ticket  Fare Cabin
##         <dbl>    <dbl>  <dbl> <chr>   <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr>
## 1           1        0      3 Braund… male     22     1     0 A/5 2…  7.25 <NA> 
## 2           2        1      1 Cuming… fema…    38     1     0 PC 17… 71.3  C85  
## 3           3        1      3 Heikki… fema…    26     0     0 STON/…  7.92 <NA> 
## 4           4        1      1 Futrel… fema…    35     1     0 113803 53.1  C123 
## 5           5        0      3 Allen,… male     35     0     0 373450  8.05 <NA> 
## 6           6        0      3 Moran,… male     NA     0     0 330877  8.46 <NA> 
## # ℹ 1 more variable: Embarked <chr>
# Recode the numeric survival indicator (1 = survived) as a "No"/"Yes" factor
titanic$Survived <- as.factor(
  ifelse(titanic$Survived == 1, "Yes", "No")
)

# Preview the data to confirm the new column type
head(titanic)
## # A tibble: 6 × 12
##   PassengerId Survived Pclass Name    Sex     Age SibSp Parch Ticket  Fare Cabin
##         <dbl> <fct>     <dbl> <chr>   <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr>
## 1           1 No            3 Braund… male     22     1     0 A/5 2…  7.25 <NA> 
## 2           2 Yes           1 Cuming… fema…    38     1     0 PC 17… 71.3  C85  
## 3           3 Yes           3 Heikki… fema…    26     0     0 STON/…  7.92 <NA> 
## 4           4 Yes           1 Futrel… fema…    35     1     0 113803 53.1  C123 
## 5           5 No            3 Allen,… male     35     0     0 373450  8.05 <NA> 
## 6           6 No            3 Moran,… male     NA     0     0 330877  8.46 <NA> 
## # ℹ 1 more variable: Embarked <chr>
# Treat passenger class and the two family-count columns as categorical:
# convert each of them from numeric to factor in a single assignment
titanic[c("Pclass", "SibSp", "Parch")] <- lapply(
  titanic[c("Pclass", "SibSp", "Parch")],
  as.factor
)

# Preview the data to confirm the new column types
head(titanic)
## # A tibble: 6 × 12
##   PassengerId Survived Pclass Name    Sex     Age SibSp Parch Ticket  Fare Cabin
##         <dbl> <fct>    <fct>  <chr>   <chr> <dbl> <fct> <fct> <chr>  <dbl> <chr>
## 1           1 No       3      Braund… male     22 1     0     A/5 2…  7.25 <NA> 
## 2           2 Yes      1      Cuming… fema…    38 1     0     PC 17… 71.3  C85  
## 3           3 Yes      3      Heikki… fema…    26 0     0     STON/…  7.92 <NA> 
## 4           4 Yes      1      Futrel… fema…    35 1     0     113803 53.1  C123 
## 5           5 No       3      Allen,… male     35 0     0     373450  8.05 <NA> 
## 6           6 No       3      Moran,… male     NA 0     0     330877  8.46 <NA> 
## # ℹ 1 more variable: Embarked <chr>
# Count the missing values in the Age column
sum(is.na(titanic$Age))
## [1] 177
# The Age column has 177 missing values.
# Either the mean or the median could be used to fill them in; the median is
# used here because the mean might not give accurate results.

titanic$Age <- replace(
  titanic$Age,
  is.na(titanic$Age),
  round(median(titanic$Age, na.rm = TRUE))
)

# Re-count the missing ages to confirm the imputation
sum(is.na(titanic$Age))
## [1] 0
# Now there are no missing values in Age column
# Exploratory Data Analysis

#Univariate Analysis

#Question
#How many passengers are travelling in each class?

# Bar chart of the number of passengers in each class, with the count
# printed above every bar. after_stat(count) replaces the `..count..`
# notation, which was deprecated in ggplot2 3.4.0.
ggplot(data = titanic, aes(x = Pclass, fill = Pclass)) +
  geom_bar(position = "dodge") +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = -0.2
  ) +
  ylab("Number of Passengers")
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# We can see that most of the passengers are from class 3 (the cheapest of all)
# Surprisingly, there are more passengers in Class-1 than in Class-2
# Question
# How many passengers survived?

# Bar chart of the number of passengers per survival outcome, with the
# count printed above every bar. after_stat(count) replaces the `..count..`
# notation, which was deprecated in ggplot2 3.4.0.
ggplot(data = titanic, aes(x = Survived, fill = Survived)) +
  geom_bar(position = "dodge") +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = -0.2
  ) +
  ylab("Number of Passengers")

# Here we can see that 342 passengers survived and 549 passengers lost their lives
#Question
# How are the ticket fares distributed?
# Histogram of ticket fares, using bins 15 fare units wide
ggplot(titanic, aes(x = Fare)) +
  geom_histogram(binwidth = 15) +
  labs(x = "Fare")

# The ticket fares are consistent with the ticket classes: the most frequently purchased tickets are the cheapest ones offered to board the Titanic.
# BiVariate Analysis
# Question
# Which passenger class had the highest chance of survival?
# Side-by-side bars of survivors and non-survivors within each passenger
# class, with the count printed above every bar. after_stat(count) replaces
# the `..count..` notation, which was deprecated in ggplot2 3.4.0.
ggplot(titanic, aes(x = Pclass, fill = Survived)) +
  geom_bar(position = "dodge") +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_dodge(0.9),
    vjust = -0.2
  ) +
  ylab("Number of Passengers") +
  xlab("Passenger Class")

# From the chart we can conclude that people who paid more, i.e. Class-1 passengers, had a much better chance of survival than the others.
# Question
# Which ticket class was chosen by passengers of which age?
# Frequency polygons of passenger age, one line per class (2.5-year bins)
ggplot(titanic, aes(x = Age, colour = Pclass)) +
  geom_freqpoly(binwidth = 2.5) +
  labs(y = "Frequency")

# Here, we can see that most of the passengers, irrespective of ticket class, are of a similar age (approx. 30).
# Note that the 177 missing ages were all filled in with the median, which exaggerates this peak.
# Age Distribution by passenger class and sex
# Boxplots of age per passenger class, drawn in a separate row for each sex
ggplot(data = titanic) +
  geom_boxplot(mapping = aes(x = factor(Pclass), y = Age, fill = Sex)) +
  facet_grid(rows = vars(Sex)) +
  scale_fill_manual(name = "Sex", values = c("lightblue", "pink")) +
  ggtitle("Age Distribution by Passenger Class and Sex") +
  xlab("Passenger Class") +
  ylab("Age") +
  theme_minimal()

# This faceted boxplot shows a similar picture: most of the passengers
# were aged 20-40 across all the ticket classes.
# Boxplot of fare by survival status.
# Survived was recoded to a "No"/"Yes" factor earlier in the script, so the
# x-axis label must not describe the old 0/1 coding.
boxplot(
  Fare ~ Survived,
  data = titanic,
  main = "Fare by Survival Status",
  xlab = "Survived",
  ylab = "Fare",
  col = "lightgreen"
)

# Scatterplot of ticket fare against passenger age
with(
  titanic,
  plot(Age, Fare, main = "Age vs. Fare", xlab = "Age", ylab = "Fare", col = "blue")
)

# Scatterplot matrix of the quantitative columns.
# Survived, Pclass, SibSp and Parch were converted to factors earlier in the
# script. pairs() converts factor columns to their internal level codes
# (so Parch 0 would be drawn as 1 and Survived as 1/2), which is why they
# are turned back into their real numeric values before plotting.
numeric_vars <- titanic[, c("Survived", "Pclass", "Age", "SibSp", "Parch", "Fare")]
if (is.factor(numeric_vars$Survived)) {
  # Map the "No"/"Yes" labels back to the 0/1 indicator
  numeric_vars$Survived <- as.integer(numeric_vars$Survived == "Yes")
}
# as.character() first, so the level labels (not the codes) become numbers
numeric_vars[c("Pclass", "SibSp", "Parch")] <- lapply(
  numeric_vars[c("Pclass", "SibSp", "Parch")],
  function(column) as.numeric(as.character(column))
)
pairs(numeric_vars)

# Question
# How many passengers survived, broken down by port of embarkation and passenger class?
# Survival counts in a grid of panels: one row per passenger class and one
# column per port of embarkation.
# Survived was recoded to a "No"/"Yes" factor earlier in the script, so the
# x-axis label must not describe the old 0/1 coding.
ggplot(titanic, aes(x = factor(Survived), fill = factor(Survived))) +
  geom_bar() +
  facet_grid(Pclass ~ Embarked) +
  scale_fill_manual(values = c("lightblue", "pink"), name = "Survived") +
  labs(
    title = "Survival Counts by Passenger Class and Embarked Port",
    x = "Survived",
    y = "Count"
  ) +
  theme_minimal()

# Here we can conclude that passengers who embarked at Southampton account for both the largest number of survivors and the largest number of deaths
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# 3D scatterplot of age, fare and survival status using plotly.
# The colours are named by factor level so that "No" is light blue and "Yes"
# is pink, the same assignment the ggplot survival bar chart in this script
# produces; the original vector gave the two outcomes the opposite colours.
plot_ly(
  data = titanic,
  x = ~Age,
  y = ~Fare,
  z = ~Survived,
  color = ~Survived,
  colors = c(No = "lightblue", Yes = "pink")
) %>%
  add_markers() %>%
  layout(scene = list(
    xaxis = list(title = "Age"),
    yaxis = list(title = "Fare"),
    zaxis = list(title = "Survived")
  ))